Imports and configs¶
In [1]:
import tensorflow as tf
import numpy as np
from collections import deque
import gymnasium as gym
import matplotlib.pyplot as plt
import matplotlib.animation
import time
import pickle
import os
# GPU setup immediately after imports
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
try:
tf.config.experimental.set_virtual_device_configuration(
gpus[0],
[tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4096)])
except RuntimeError as e:
print(e)
2024-12-06 03:52:16.200622: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F AVX512_VNNI FMA To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. 2024-12-06 03:52:17.016282: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. 2024-12-06 03:52:17.395980: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered 2024-12-06 03:52:20.002103: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.2/lib64:/usr/lib/mesa-diverted/x86_64-linux-gnu:/usr/lib/x86_64-linux-gnu/mesa:/usr/lib/x86_64-linux-gnu/dri:/usr/lib/x86_64-linux-gnu/gallium-pipe 2024-12-06 03:52:20.002322: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.2/lib64:/usr/lib/mesa-diverted/x86_64-linux-gnu:/usr/lib/x86_64-linux-gnu/mesa:/usr/lib/x86_64-linux-gnu/dri:/usr/lib/x86_64-linux-gnu/gallium-pipe 2024-12-06 03:52:20.002346: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
Handling code/VM crash¶
In [2]:
# Function to save training progress
def save_progress(episode, model, replay_buffer, rewards_history, steps_history,
successful_landings, crashes, rolling_reward_avg, times, best_score,
best_weights, start_time, last_print_time):
save_dict = {
'episode': episode,
'replay_buffer': list(replay_buffer), # Convert deque to list for pickling
'rewards_history': rewards_history,
'steps_history': steps_history,
'successful_landings': successful_landings,
'crashes': crashes,
'rolling_reward_avg': rolling_reward_avg,
'times': times,
'best_score': best_score,
'best_weights': best_weights,
'start_time': start_time,
'last_print_time': last_print_time
}
# Save the model weights
model.save_weights('lunar_lander_checkpoint.h5')
# Save other variables
with open('training_progress.pkl', 'wb') as f:
pickle.dump(save_dict, f)
print(f"Progress saved at episode {episode + 1}")
# Check if we're resuming from a checkpoint
resume_training = os.path.exists('training_progress.pkl') and os.path.exists('lunar_lander_checkpoint.h5')
# Set random seeds for reproducibility
tf.random.set_seed(42)
np.random.seed(42)
Initializations¶
In [3]:
# Create the environment
env = gym.make("LunarLander-v3", render_mode="rgb_array")
print("Num GPUs:", len(tf.config.list_physical_devices('GPU')))
print("GPU Available: ", tf.test.is_built_with_cuda())
print("Devices:", tf.config.list_physical_devices())
# ------------------------ 1. Create a simple DQN epsilon policy network with 4 output neurons (one per possible action).
# [Hint: DQN Agents use Epsilon greedy policy] [15 points]
# ------------------------ 2. Discuss the rationale of the activation functions & the loss function used in the network. [10 points]
input_shape = [8] # LunarLander has 8 observations
n_outputs = 4 # LunarLander has 4 possible actions
model = tf.keras.Sequential([
tf.keras.layers.Dense(64, activation="relu", input_shape=[8]),
tf.keras.layers.Dense(64, activation="relu"),
tf.keras.layers.Dense(n_outputs)
])
# ------------------------ 3. Define the hyperparameters:
# [50 points]
# --------- (i) the number of iterations,
batch_size = 64
# --------- (ii) the number of episodes
n_episodes = 1000
# --------- (iii) the maximum number of steps, and
n_steps = 1000
training_start = 50
training_interval = 4
# --------- (iv) the discount factor at each step
gamma = 0.99
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
model.compile(optimizer=optimizer)
# Initialize or load training variables
if resume_training:
print("Loading saved progress...")
with open('training_progress.pkl', 'rb') as f:
saved_progress = pickle.load(f)
start_episode = saved_progress['episode'] + 1
replay_buffer = deque(saved_progress['replay_buffer'], maxlen=100000)
rewards_history = saved_progress['rewards_history']
steps_history = saved_progress['steps_history']
successful_landings = saved_progress['successful_landings']
crashes = saved_progress['crashes']
rolling_reward_avg = saved_progress['rolling_reward_avg']
times = saved_progress['times']
best_score = saved_progress['best_score']
best_weights = saved_progress['best_weights']
start_time = saved_progress['start_time']
last_print_time = saved_progress['last_print_time']
model.load_weights('lunar_lander_checkpoint.h5')
print(f"Resuming training from episode {start_episode}")
else:
print("Starting new training...")
start_episode = 0
replay_buffer = deque(maxlen=100000)
rewards_history = []
steps_history = []
successful_landings = []
crashes = []
rolling_reward_avg = []
times = []
best_score = -float('inf')
best_weights = None
start_time = time.time()
last_print_time = start_time
def sample_experiences(batch_size):
indices = np.random.randint(len(replay_buffer), size=batch_size)
batch = [replay_buffer[index] for index in indices]
states, actions, rewards, next_states, dones, truncateds = zip(*batch)
return (np.array(states), np.array(actions), np.array(rewards),
np.array(next_states), np.array(dones), np.array(truncateds))
def epsilon_greedy_policy(state, epsilon=0):
if np.random.rand() < epsilon:
return np.random.randint(n_outputs)
else:
Q_values = model.predict(state[np.newaxis], verbose=0)[0]
return np.argmax(Q_values)
def play_one_step(env, state, epsilon):
action = epsilon_greedy_policy(state, epsilon)
next_state, reward, done, truncated, info = env.step(action)
replay_buffer.append((state, action, reward, next_state, done, truncated))
return next_state, reward, done, truncated, info
def training_step(batch_size, gamma=0.99):
experiences = sample_experiences(batch_size)
states, actions, rewards, next_states, dones, truncateds = experiences
states = tf.cast(states, tf.float32)
next_states = tf.cast(next_states, tf.float32)
rewards = tf.cast(rewards, tf.float32)
next_Q_values = model.predict(next_states, verbose=0)
max_next_Q_values = np.max(next_Q_values, axis=1)
runs = 1.0 - (dones | truncateds)
target_Q_values = rewards + runs * gamma * max_next_Q_values
mask = tf.one_hot(actions, n_outputs, dtype=tf.float32)
with tf.GradientTape() as tape:
all_Q_values = model(states)
Q_values = tf.reduce_sum(all_Q_values * mask, axis=1)
loss = tf.reduce_mean(tf.square(target_Q_values - Q_values))
grads = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(zip(grads, model.trainable_variables))
return loss
def get_epsilon(episode, n_episodes):
epsilon_start = 1.0
epsilon_end = 0.01
return max(epsilon_end, epsilon_start - episode * (epsilon_start - epsilon_end) / n_episodes)
Num GPUs: 1 GPU Available: True Devices: [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
2024-12-06 03:52:24.651468: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F AVX512_VNNI FMA To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. 2024-12-06 03:52:26.795912: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 4096 MB memory: -> device: 0, name: NVIDIA L4, pci bus id: 0000:00:03.0, compute capability: 8.9
Loading saved progress... Resuming training from episode 1000
Training¶
In [4]:
# ------------------------ 4. Train the agent on the LunarLander-v2 or v3 environment for a sufficient number of episodes
# to achieve a satisfactory level of performance. [10 points]
print("\nStarting training...\n")
print("Episode Steps Outcome Avg-Reward(20) Time(5ep) Epsilon")
print("-" * 65)
# If resuming training, reconstruct and print the progress table
if resume_training and len(rewards_history) > 0:
print("\nReconstructing progress table from saved data...\n")
previous_episodes = len(rewards_history)
times_len = len(times)
# Initialize last_print_time for accurate time calculations
if times_len > 0:
last_print_time = times[0] - (times[1] - times[0]) if times_len > 1 else start_time
else:
last_print_time = start_time
# Calculate the number of complete 5-episode intervals
num_complete_intervals = previous_episodes // 5
# Adjust the loop to prevent IndexError
for idx in range(min(times_len, num_complete_intervals)):
# Each time corresponds to episode_num = (idx + 1) * 5
episode_num = (idx + 1) * 5
episode_idx = episode_num - 1 # zero-based index
# Check if episode_idx is within the bounds of the data
if episode_idx >= previous_episodes:
break # No more data to process
# Retrieve data for the episode
episode_steps = steps_history[episode_idx]
episode_reward = rewards_history[episode_idx]
avg_reward = rolling_reward_avg[episode_idx] if episode_idx < len(rolling_reward_avg) else np.mean(rewards_history[max(0, episode_idx-19):episode_idx+1])
epsilon = get_epsilon(episode_idx, n_episodes)
current_time = times[idx]
time_for_last_5 = current_time - last_print_time
last_print_time = current_time
final_reward = rewards_history[episode_idx]
# Determine outcome based on success condition
if final_reward >= 200:
outcome = "SUCCESS"
else:
outcome = "CRASH"
log_entry = f"{episode_num:7d} {episode_steps:5d} {outcome:7s} {avg_reward:13.2f} {time_for_last_5:9.2f} {epsilon:.3f}"
print(log_entry)
try:
for episode in range(start_episode, n_episodes):
obs, info = env.reset()
epsilon = get_epsilon(episode, n_episodes)
episode_reward = 0
episode_steps = 0
final_reward = 0
for step in range(n_steps):
obs, reward, done, truncated, info = play_one_step(env, obs, epsilon)
episode_reward += reward
episode_steps += 1
if episode > training_start and step % training_interval == 0:
if len(replay_buffer) >= batch_size:
training_step(batch_size, gamma)
if done or truncated:
final_reward = reward
break
# Update tracking
rewards_history.append(episode_reward)
steps_history.append(episode_steps)
if episode_reward > best_score:
best_score = episode_reward
best_weights = model.get_weights()
# Track outcomes
if final_reward >= 200:
successful_landings.append(episode)
outcome = "SUCCESS"
else:
crashes.append(episode)
outcome = "CRASH"
recent_rewards = rewards_history[-20:] if len(rewards_history) >= 20 else rewards_history
avg_reward = np.mean(recent_rewards)
rolling_reward_avg.append(avg_reward)
# Print progress and save checkpoint
if (episode + 1) % 5 == 0:
current_time = time.time()
time_for_last_5 = current_time - last_print_time
times.append(current_time)
print(f"{episode+1:7d} {episode_steps:5d} {outcome:7s} {avg_reward:13.2f} {time_for_last_5:9.2f} {epsilon:.3f}")
last_print_time = current_time
if (episode + 1) % 25 == 0: # Save every 25 episodes
save_progress(episode, model, replay_buffer, rewards_history, steps_history,
successful_landings, crashes, rolling_reward_avg, times,
best_score, best_weights, start_time, last_print_time)
except KeyboardInterrupt:
print("\nTraining interrupted. Saving progress...")
save_progress(episode, model, replay_buffer, rewards_history, steps_history,
successful_landings, crashes, rolling_reward_avg, times,
best_score, best_weights, start_time, last_print_time)
print("Progress saved. You can resume training by running this script again.")
Starting training...
Episode Steps Outcome Avg-Reward(20) Time(5ep) Epsilon
-----------------------------------------------------------------
Reconstructing progress table from saved data...
5 84 CRASH -266.27 0.21 0.996
10 63 CRASH -197.57 0.21 0.991
15 84 CRASH -209.90 0.19 0.986
20 72 CRASH -181.57 0.42 0.981
25 105 CRASH -156.85 0.34 0.976
30 100 CRASH -164.56 1.18 0.971
35 124 CRASH -131.34 1.19 0.966
40 101 CRASH -170.11 0.84 0.961
45 105 CRASH -199.42 1.38 0.956
50 82 CRASH -210.26 0.64 0.951
55 104 CRASH -217.51 5.51 0.947
60 104 CRASH -222.67 6.69 0.942
65 88 CRASH -185.79 7.96 0.937
70 121 CRASH -176.34 9.60 0.932
75 84 CRASH -183.65 8.26 0.927
80 92 CRASH -169.83 50.85 0.922
85 80 CRASH -200.28 8.40 0.917
90 75 CRASH -205.75 9.27 0.912
95 136 CRASH -217.30 10.10 0.907
100 144 CRASH -208.08 12.44 0.902
105 83 CRASH -197.87 10.01 0.897
110 57 CRASH -188.50 12.06 0.892
115 139 CRASH -170.83 10.74 0.887
120 109 CRASH -167.26 10.62 0.882
125 130 CRASH -168.44 9.55 0.877
130 79 CRASH -186.01 10.17 0.872
135 105 CRASH -190.65 9.27 0.867
140 115 CRASH -185.65 9.99 0.862
145 75 CRASH -171.23 11.42 0.857
150 114 CRASH -182.24 10.72 0.852
155 82 CRASH -194.78 11.84 0.848
160 100 CRASH -207.93 8.50 0.843
165 72 CRASH -195.95 11.58 0.838
170 88 CRASH -162.13 12.28 0.833
175 81 CRASH -145.02 9.22 0.828
180 178 CRASH -122.24 9.84 0.823
185 140 CRASH -139.40 15.75 0.818
190 92 CRASH -157.00 15.02 0.813
195 113 CRASH -181.77 10.59 0.808
200 122 CRASH -204.45 11.01 0.803
205 101 CRASH -216.89 18.19 0.798
210 132 CRASH -218.43 14.79 0.793
215 116 CRASH -198.76 17.81 0.788
220 87 CRASH -199.51 15.34 0.783
225 106 CRASH -185.98 14.61 0.778
230 80 CRASH -173.35 16.83 0.773
235 74 CRASH -171.05 14.88 0.768
240 120 CRASH -163.63 13.15 0.763
245 125 CRASH -181.66 16.26 0.758
250 134 CRASH -201.21 15.41 0.753
255 88 CRASH -220.77 17.33 0.749
260 89 CRASH -208.02 15.16 0.744
265 81 CRASH -183.12 14.65 0.739
270 92 CRASH -153.28 14.06 0.734
275 147 CRASH -128.49 16.12 0.729
280 331 CRASH -139.31 14.99 0.724
285 115 CRASH -139.56 22.91 0.719
290 109 CRASH -135.00 17.19 0.714
295 94 CRASH -127.97 41.17 0.709
300 104 CRASH -122.85 16.92 0.704
305 105 CRASH -119.49 16.97 0.699
310 153 CRASH -129.36 16.09 0.694
315 137 CRASH -175.33 21.51 0.689
320 100 CRASH -178.07 19.25 0.684
325 87 CRASH -181.68 23.70 0.679
330 110 CRASH -176.85 16.35 0.674
335 155 CRASH -136.21 20.49 0.669
340 77 CRASH -133.84 31.01 0.664
345 107 CRASH -128.55 22.94 0.659
350 183 CRASH -132.57 18.64 0.654
355 183 CRASH -122.89 22.72 0.650
360 220 CRASH -115.11 26.69 0.645
365 114 CRASH -106.84 26.51 0.640
370 144 CRASH -96.20 19.74 0.635
375 172 CRASH -94.91 29.18 0.630
380 178 CRASH -88.17 24.64 0.625
385 105 CRASH -98.76 23.98 0.620
390 169 CRASH -96.22 19.58 0.615
395 156 CRASH -93.03 20.82 0.610
400 210 CRASH -94.48 25.08 0.605
405 109 CRASH -83.73 27.28 0.600
410 144 CRASH -83.68 22.78 0.595
415 262 CRASH -103.65 24.92 0.590
420 188 CRASH -101.29 33.58 0.585
425 85 CRASH -97.88 24.35 0.580
430 111 CRASH -101.54 31.17 0.575
435 324 CRASH -101.07 58.44 0.570
440 239 CRASH -103.86 31.32 0.565
445 143 CRASH -92.38 27.62 0.560
450 147 CRASH -99.69 26.53 0.555
455 179 CRASH -84.90 30.22 0.551
460 90 CRASH -86.56 35.42 0.546
465 201 CRASH -104.66 27.57 0.541
470 116 CRASH -100.21 24.71 0.536
475 270 CRASH -103.19 24.39 0.531
480 226 CRASH -106.53 1639.60 0.526
485 163 CRASH -101.41 51.75 0.521
490 309 CRASH -119.64 38.55 0.516
495 401 CRASH -143.84 72.81 0.511
500 229 CRASH -142.94 50.19 0.506
505 268 CRASH -151.54 46.77 0.501
510 210 CRASH -144.94 47.21 0.496
515 429 CRASH -136.41 47.14 0.491
520 359 CRASH -158.75 69.06 0.486
525 233 CRASH -145.00 49.84 0.481
530 396 CRASH -166.57 87.87 0.476
535 256 CRASH -167.56 81.22 0.471
540 303 CRASH -158.09 57.13 0.466
545 101 CRASH -164.81 51.04 0.461
550 383 CRASH -142.63 44.32 0.456
555 74 CRASH -128.23 89.75 0.452
560 112 CRASH -112.33 66.44 0.447
565 315 CRASH -107.91 34.88 0.442
570 231 CRASH -95.05 45.53 0.437
575 312 CRASH -90.60 60.01 0.432
580 141 CRASH -92.03 60.96 0.427
585 329 CRASH -93.57 45.33 0.422
590 363 CRASH -90.46 67.75 0.417
595 290 CRASH -88.30 69.82 0.412
600 163 CRASH -80.28 58.01 0.407
605 205 CRASH -82.51 87.83 0.402
610 314 CRASH -87.75 64.39 0.397
615 293 CRASH -82.16 132.34 0.392
620 695 CRASH -84.02 90.38 0.387
625 247 CRASH -68.75 83.06 0.382
630 263 CRASH -49.70 84.46 0.377
635 298 CRASH -40.37 101.48 0.372
640 370 CRASH -44.19 88.37 0.367
645 166 CRASH -36.42 150.09 0.362
650 1000 CRASH -39.45 70.69 0.357
655 289 CRASH -50.90 1287.99 0.353
660 522 CRASH -34.48 149.06 0.348
665 140 CRASH -43.85 125.57 0.343
670 884 CRASH -61.39 67.85 0.338
675 773 CRASH -59.79 201.29 0.333
680 1000 CRASH -60.74 212.71 0.328
685 577 CRASH -57.21 215.41 0.323
690 129 CRASH -39.11 98.30 0.318
695 1000 CRASH -45.10 145.89 0.313
700 669 CRASH -53.00 145.18 0.308
705 1000 CRASH -55.90 255.22 0.303
710 1000 CRASH -56.61 274.75 0.298
715 1000 CRASH -42.90 218.48 0.293
720 1000 CRASH -40.63 267.23 0.288
725 1000 CRASH -39.28 233.35 0.283
730 1000 CRASH -48.44 2269.60 0.278
735 592 CRASH -52.45 276.45 0.273
740 1000 CRASH -41.03 251.22 0.268
745 1000 CRASH -40.71 2102.27 0.263
750 1000 CRASH -18.74 257.51 0.258
755 767 CRASH -11.29 280.59 0.254
760 1000 CRASH -11.53 240.87 0.249
765 1000 CRASH -0.45 279.94 0.244
770 1000 CRASH -7.39 268.41 0.239
775 852 CRASH -4.98 238.69 0.234
780 1000 CRASH -8.23 2077.79 0.229
785 1000 CRASH -12.16 289.84 0.224
790 1000 CRASH -9.23 247.55 0.219
795 1000 CRASH -6.98 288.94 0.214
800 352 CRASH -10.61 283.22 0.209
805 929 CRASH -0.89 204.79 0.204
810 302 CRASH -5.22 238.03 0.199
815 1000 CRASH -3.31 224.14 0.194
820 1000 CRASH 19.97 265.62 0.189
825 1000 CRASH 10.21 275.53 0.184
830 467 SUCCESS 43.93 3073.26 0.179
835 771 CRASH 50.69 170.22 0.174
840 1000 CRASH 15.34 248.30 0.169
845 1000 CRASH 0.84 193.13 0.164
850 1000 CRASH -45.64 259.73 0.159
855 172 CRASH -76.35 258.47 0.155
860 1000 CRASH -67.14 258.48 0.150
865 1000 CRASH -54.51 301.89 0.145
870 1000 CRASH -54.38 215.18 0.140
875 823 CRASH -48.69 266.22 0.135
880 1000 CRASH -53.05 1156.19 0.130
885 176 CRASH -67.24 312.10 0.125
890 1000 CRASH -52.93 262.44 0.120
895 234 CRASH -62.09 232.10 0.115
900 295 CRASH -61.73 166.67 0.110
905 301 CRASH -63.73 219.96 0.105
910 1000 CRASH -80.45 228.56 0.100
915 1000 CRASH -68.07 319.72 0.095
920 248 CRASH -55.86 275.78 0.090
925 1000 CRASH -32.34 269.67 0.085
930 1000 CRASH 9.40 552.71 0.080
935 181 CRASH 42.35 307.61 0.075
940 1000 CRASH 35.32 200.61 0.070
945 1000 CRASH 43.81 331.32 0.065
950 1000 CRASH 25.92 322.58 0.060
955 1000 CRASH -2.59 3076.69 0.056
960 1000 CRASH -0.62 452.48 0.051
965 681 SUCCESS 3.85 380.84 0.046
970 1000 CRASH 13.88 433.52 0.041
975 958 CRASH 21.40 362.63 0.036
980 1000 CRASH 40.59 1629.61 0.031
985 1000 CRASH 30.60 325.45 0.026
990 1000 CRASH 26.49 320.70 0.021
995 870 CRASH 33.18 272.47 0.016
1000 1000 CRASH 14.84 327.51 0.011
Analysis¶
In [5]:
# ------------------------ 5. Analyze the agent's learning progress by plotting relevant performance metrics
# (e.g., cumulative rewards, episode length) over time. [10 points]
# Restore best weights
model.set_weights(best_weights)
# Final Analysis
def calculate_times_with_interpolation(timestamps):
# Calculate the time differences
times = [timestamps[1] - timestamps[0]] # Assume the first difference equals the second
times += [timestamps[i] - timestamps[i - 1] for i in range(1, len(timestamps))]
# Identify indices with times > 1000
for i in range(len(times)):
if times[i] > 1000:
# Interpolate using previous and next values
prev_time = times[i - 1] if i > 0 else 0
next_time = times[i + 1] if i + 1 < len(times) else 0
times[i] = (prev_time + next_time) / 2
return times
# Calculate corrected times
corrected_times = calculate_times_with_interpolation(times)
import datetime
total_time = np.sum(corrected_times)
total_time = str(datetime.timedelta(seconds=int(total_time)))
print("\nTraining Summary:")
print(f"Total training time: {total_time}")
print(f"Episodes hitting max steps: {sum(1 for s in steps_history if s >= 999)}/{n_episodes}")
print(f"Successful landings: {len(successful_landings)}/{n_episodes} ({len(successful_landings)/n_episodes*100:.1f}%)")
print(f"Crashes: {len(crashes)}/{n_episodes} ({len(crashes)/n_episodes*100:.1f}%)")
print(f"Best score achieved: {best_score:.2f}")
# Plot training results
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(8, 10))
# Font size configuration
font_title = 14
font_label = 12
font_tick = 10
font_legend = 11
ax1.plot(rewards_history, alpha=0.6, label='Episode Reward')
ax1.plot(rolling_reward_avg, label='20-Episode Average', linewidth=2)
ax1.set_title('Training Progress - Rewards', fontsize=font_title)
ax1.set_xlabel('Episode', fontsize=font_label)
ax1.set_ylabel('Total Reward', fontsize=font_label)
ax1.tick_params(axis='both', labelsize=font_tick)
ax1.grid(True)
ax1.legend(fontsize=font_legend)
episodes = range(1, len(rewards_history) + 1)
ax2.scatter(successful_landings, [1]*len(successful_landings), color='green', label='Successful Landings', alpha=0.6)
ax2.scatter(crashes, [0]*len(crashes), color='red', label='Crashes', alpha=0.6)
ax2.set_title('Landing Outcomes', fontsize=font_title)
ax2.set_xlabel('Episode', fontsize=font_label)
ax2.set_yticks([0, 1])
ax2.set_yticklabels(['Crash', 'Success'], fontsize=font_tick)
ax2.tick_params(axis='both', labelsize=font_tick)
ax2.grid(True)
ax2.legend(fontsize=font_legend)
# Correct x-axis to match every 5 episodes, ending at 1000
x_axis_corrected = [i * 5 for i in range(len(corrected_times))]
ax3.plot(x_axis_corrected, corrected_times, color='blue', linewidth=2, label='Corrected Times')
ax3.set_title('Episode Times', fontsize=font_title)
ax3.set_xlabel('Episode', fontsize=font_label) # Updated label to "Episode"
ax3.set_ylabel('Time (s)', fontsize=font_label)
ax3.tick_params(axis='both', labelsize=font_tick)
ax3.grid(True)
ax3.legend(fontsize=font_legend)
plt.tight_layout()
plt.show()
Training Summary: Total training time: 6:12:25 Episodes hitting max steps: 229/1000 Successful landings: 15/1000 (1.5%) Crashes: 985/1000 (98.5%) Best score achieved: 312.91
Animations¶
In [6]:
def plot_animation(frames, repeat=False, interval=40):
plt.figure()
patch = plt.imshow(frames[0])
plt.axis('off')
anim = matplotlib.animation.FuncAnimation(
plt.gcf(), lambda i: patch.set_data(frames[i]),
frames=len(frames), repeat=repeat, interval=interval)
from IPython.display import HTML
return HTML(anim.to_jshtml()) # This line is key for Jupyter display
def show_video(env, model, max_steps=2000):
frames = []
obs, _ = env.reset()
total_reward = 0
total_steps = 0
for step in range(max_steps):
frames.append(env.render())
action = epsilon_greedy_policy(obs, epsilon=0)
obs, reward, done, truncated, _ = env.step(action)
total_reward += reward
total_steps += 1
if done or truncated:
print("Landing status:", "Success!" if reward >= 200 else "Crash!")
print(f"Total reward: {total_reward:.2f}")
print(f"Total steps: {total_steps}")
break
env.close()
return plot_animation(frames)
show_video(env, model)
2024-12-06 03:52:31.448371: I tensorflow/stream_executor/cuda/cuda_blas.cc:1614] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
Landing status: Crash! Total reward: -60.75 Total steps: 1000
Out[6]: